import scrapy
from scrapy.http import request
from items import AirspiderItem
import requests
from lxml import etree

Any = False  # Set to True to crawl every city returned by AnyCity() instead of the built-in city list


def AnyCity():
    """Fetch the AQI index page and build a city -> URL-template mapping.

    Downloads ``http://www.tianqihoubao.com/aqi/``, decodes it as GBK and
    collects every ``<a>`` found under the ``citychk`` block, skipping the
    first ``<dl>`` group.

    Returns:
        dict: Maps each link's text to an absolute URL that carries a
        ``{}`` placeholder right before the file extension; the caller
        fills the placeholder with ``str.format``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Bound the wait: without a timeout, requests may block indefinitely.
    resp = requests.get("http://www.tianqihoubao.com/aqi/", timeout=10)
    # Fail loudly instead of parsing an error page as if it were the index.
    resp.raise_for_status()
    resp.encoding = "gbk"
    html = etree.HTML(resp.text)

    dist = {}

    # The first <dl> under the city block is deliberately skipped.
    for group in html.xpath('//div[@class="citychk"]/dl')[1:]:
        for anchor in group.xpath("./dd/a"):
            hrefs = anchor.xpath("./@href")
            names = anchor.xpath("./text()")
            if not hrefs or not names:
                # Anchor without a link target or direct text: nothing to map.
                continue

            # Insert the placeholder before the extension only, so any other
            # dot in the path is left untouched.
            stem, dot, ext = hrefs[0].rpartition(".")
            if not dot:
                # No extension means no place for the "-{}" date suffix.
                continue

            dist[names[0]] = "http://www.tianqihoubao.com" + stem + "-{}." + ext

    return dist


class AirSpider(scrapy.Spider):
    """Spider that scrapes monthly air-quality tables per city.

    For every month from 2018-01 through 2020-12 and every configured city,
    a request is issued to the city's monthly page; each table row with the
    expected number of cells is emitted as an ``AirspiderItem``.
    """

    name = 'air'
    # Scrapy compares these entries against request host names, so they must
    # be bare domains; a full URL is not treated as a valid domain entry.
    allowed_domains = ['www.tianqihoubao.com']

    def start_requests(self):
        """Yield one request per (year, month, city) combination.

        When the module-level ``Any`` flag is truthy the city list comes from
        ``AnyCity()``; otherwise a fixed set of six cities is used. The city
        name travels with the request in ``meta["city"]``.
        """
        if Any:
            cites = AnyCity()
        else:
            cites = {
                "北京": 'http://www.tianqihoubao.com/aqi/beijing-{}.html',
                "天津": "http://www.tianqihoubao.com/aqi/tianjin-{}.html",
                "上海": "http://www.tianqihoubao.com/aqi/shanghai-{}.html",
                "重庆": "http://www.tianqihoubao.com/aqi/chongqing-{}.html",
                "广州": "http://www.tianqihoubao.com/aqi/guangzhou-{}.html",
                "深圳": "http://www.tianqihoubao.com/aqi/shenzhen-{}.html",
            }

        for year in range(2018, 2021):
            for month in range(1, 13):
                # YYYYMM with a zero-padded month, e.g. 201801.
                date = "{}{:02d}".format(year, month)

                for city, template in cites.items():
                    yield scrapy.Request(
                        url=template.format(date),
                        callback=self.parse,
                        meta={"city": city},
                    )

    def parse(self, response):
        """Turn the rows of a monthly table into ``AirspiderItem`` objects.

        Args:
            response: Scrapy response whose ``meta["city"]`` holds the city
                name set by ``start_requests``.

        Yields:
            AirspiderItem: One item per table row that has exactly ten text
            cells; the first ``<tr>`` is skipped.
        """
        city = response.meta["city"]
        rows = response.xpath('//div[@class="api_month_list"]//tr')[1:]
        for row in rows:
            cells = [text.strip() for text in row.xpath('./td/text()').getall()]
            if len(cells) != 10:
                # Skip malformed rows that lack the expected ten text cells.
                continue

            # Cells at index 1 and 3 are not stored on the item.
            yield AirspiderItem(
                city=city,
                date=cells[0],
                aqi=cells[2],
                pm25=cells[4],
                pm10=cells[5],
                so2=cells[6],
                no2=cells[7],
                co=cells[8],
                o3=cells[9],
            )
